# -*- python -*-
#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
#$Id$
"""This module provides a class :class:`~KEGG` to access to the
REST KEGG interface. There are additional methods and functionalities added by
**BioServices**.

.. note:: a previous imterface to the KEGG WSDL service was designed but the
    WSDL closed in  Dec 2012.

.. topic:: What is KEGG ?

    :URL: http://www.kegg.jp/
    :REST: http://www.kegg.jp/kegg/rest/keggapi.html
    :weblink: http://www.genome.jp/kegg/rest/weblink.html
    :dbentries: http://www.genome.jp/kegg/rest/dbentry.html

    .. highlights::

        "KEGG is a database resource for understanding high-level functions and
        utilities of the biological system, such as the cell, the organism and the
        ecosystem, from molecular-level information, especially large-scale molecular
        datasets generated by genome sequencing and other high-throughput experimental
        technologies (See Release notes for new and updated features). "

        -- KEGG home page, Jan 2013

.. _terminology:

Some terminology
--------------------

The following list is a simplified list of terminology taken from KEGG API
pages.


* organisms (**org**) are made of a three-letter (or four-letter) code (e.g.,
  **hsa** stands for Human Sapiens) used in KEGG (see  :attr:`~bioservices.kegg.KEGG.organismIds`).
* **db** is a database name. See :attr:`~bioservices.kegg.KEGG.databases`
  attribute and :ref:`kegg_database` section.
* **entry_id** is a unique identifier. It is a combination of the database name
  and the identifier of an entry joined by a colon sign (e.g. 'embl:J00231').

  Here are some examples of entry Ids:

    * **genes_id**: A KEGG organism and a gene name (e.g. 'eco:b0001').
    * **enzyme_id**: 'ec' and an enzyme code. (e.g. 'ec:1.1.1.1'). 
      See :attr:`~bioservices.kegg.KEGG.enzymeIds`.
    * **compound_id**: 'cpd' and a compound number (e.g. 'cpd:C00158').
      Some compounds also have 'glycan_id' and
      both IDs are accepted and converted internally.
      See :attr:`~bioservices.kegg.KEGG.compoundIds`.
    * **drug_id**: 'dr' and a drug number (e.g. 'dr:D00201'). See
      :attr:`~bioservices.kegg.KEGG.drugIds`.
    * **glycan_id**: 'gl' and a glycan number (e.g.
    * 'gl:G00050'). Some glycans also have 'compound_id' and both
      IDs are accepted and converted internally. see
      :attr:`~bioservices.kegg.KEGG.glycanIds` attribute.
    * **reaction_id**:  'rn' and a reaction number (e.g.
    * 'rn:R00959' is a reaction which catalyze cpd:C00103 into cpd:C00668).
      See :attr:`~bioservices.kegg.KEGG.reactionIds` attribute.
    * **pathway_id**: 'path' and a pathway number. Pathway numbers prefixed
      by 'map' specify the reference pathway and pathways prefixed by
      a KEGG organism specify pathways specific to the organism (e.g.
      'path:map00020', 'path:eco00020'). 
      See :attr:`~bioservices.kegg.KEGG.pathwayIds` attribute.
    * **motif_id**: a motif database names ('ps' for prosite, 'bl' for blocks,
      'pr' for prints, 'pd' for prodom, and 'pf' for pfam) and a motif entry
      name. (e.g. 'pf:DnaJ' means a Pfam  database entry 'DnaJ').
    * **ko_id**: identifier made of 'ko' and a ko number (e.g. 'ko:K02598').
      See :attr:`~bioservices.kegg.KEGG.koIds` attribute.


.. _kegg_database:

KEGG Databases Names and Abbreviations
-------------------------------------------

Here is a list of databases used in KEGG API with their name and abbreviation:

=============== =========== ==========
Database Name   Abbrev      kid
=============== =========== ==========
pathway         path        map number
brite           br          br number
module          md          M number
disease         ds          H number
drug            dr          D number
environ         ev          E number
orthology       ko          K number
genome          genome      T number
genomes         gn          T number
genes           -           -
ligand          ligand      -
compound        cpd         C number
glycan          gl          G number
reaction        rn          R number
rpair           rp          RP number
rclass          rc          RC number
enzyme          ec          -
=============== =========== ==========


.. _db_entries:

Database Entries
-------------------

Database entries can be written in on of the following ways::

    <dbentries> = <dbentry>1[+<dbentry>2...]
    <dbentry> = <db:entry> | <kid> | <org:gene>

Each database entry is identified by::

    db:entry

where "db" is the database name or its abbreviation shown above and
"entry" is the entry name or the accession number that is uniquely
assigned within the database. In reality "db" may be omitted, for
the entry name called the KEGG object identifier (kid) is unique
across KEGG.::

    kid = database-dependent prefix + five-digit number

In the KEGG GENES database the db:entry combination must be specified. This is
more specifically written as::

    org:gene

where "org" is the three- or four-letter KEGG organism code or
the T number genome identifier and "gene" is the gene identifier,
usually locus_tag or ncbi GeneID, or the primary gene name.


"""
from __future__ import print_function
from __future__ import unicode_literals
try:
    from functools import reduce # python3 compat
except:
    pass
from bioservices.services import REST, BioServicesError
import webbrowser
import copy
from bioservices import logger
logger.name = __name__


from easydev.logging_tools import Logging

__all__ = ["KEGG",  "KEGGParser"]


class KEGG(REST):
    """Interface to the `KEGG <http://www.genome.jp/kegg/pathway.html>`_ service

    This class provides an interface to the KEGG REST API. The weblink tools
    are partially accesible. All dbentries can be parsed into dictionaries using
    the :class:`KEGGParser`

    Here are some examples. In order to retrieve the entry of the 
    gene identifier 7535 of the **hsa** organism, type::

        from bioservices import KEGG
        s = KEGG()
        print(s.get("hsa:7535"))

    The output is the raw ouput sent by KEGG API. See :class:`KEGGParser` to
    parse this output.

    .. seealso:: The :ref:`db_entries` to know more about the db entries format.

    Another example here below shows how to print the list of pathways of
    the human organism::

        print(s.list("pathway", organism="hsa"))

    Further post processing would allow you to retrieve the pathway Ids. However,
    we provide additional functions to the KEGG API so the previous code and post
    processing to extract the pathway Ids can be written as::

        s.organism = "hsa"
        s.pathwayIds

    and similarly you can get all :meth:`databases` output and database Ids easily.
    For example, for the reaction database::

        s.reaction   # equivalent to s.list("reaction")
        s.reactionIds

    Other methods of interest are :meth:`conv`, :meth:`find`, :meth:`get`.

    .. seealso:: :ref:`kegg_database`, :ref:`db_entries`, :ref:`terminology`.

    """

    #: valid databases
    _valid_DB_base = ["module", "disease", "drug","environ", "ko",
        "genome", "compound", "glycan", "reaction", "rpair", "rclass",
        "enzyme"]
    _valid_DB = _valid_DB_base + ["pathway", "brite", "genes", "ligand", \
        "organism", "genomes", "orthology"]
    _valid_databases_info = _valid_DB_base + ["pathway", "brite", "genes",\
        "ligand", "genomes", "kegg"]
    _valid_databases_list = _valid_DB_base + ["pathway", "brite", "organism"]
    _valid_databases_find = _valid_DB_base + ["pathway", "genes", "ligand"]
    _valid_databases_link = _valid_DB_base + ["pathway", "brite"]

    _docIds = "\n\n.. seealso:: :meth:`list`\n"

    def __init__(self, verbose=False, cache=False):
        """.. rubric:: Constructor

        :param bool verbose: prints informative messages

        """
        super(KEGG, self).__init__(name="KEGG", url="http://rest.kegg.jp",
                verbose=verbose, cache=cache)
        self.easyXMLConversion = False
        self._organism = None

        self._organisms = None
        self._organisms_tnumbers = None
        self._pathway = None
        self._glycan = None
        self._compound = None
        self._ko = None
        self._enzyme = None
        self._reaction = None
        self._drug = None
        self._brite = None

        self.keggParser = KEGGParser(verbose=verbose)

    # we could use this to retrieve all databases Ids but
    def __getattr__(self, req):
        """ for x in s._valid_databases_list: print len(getattr(s, x))"""
        if req.endswith("Ids"):
            db = req[0:-3]
            res = self.list(db)
            if db in ["",""]:
                Ids = [x.split()[1] for x in res.split("\n") if len(x)]
            else:
                Ids = [x.split()[0] for x in res.split("\n") if len(x)]
            return Ids
        elif req in self.databases:
            res = self.list(req)
            return res

    def code2Tnumber(self, code):
        """Converts organism code to its T number

        .. doctest::

            >>> from bioservices import KEGG
            >>> s = KEGG()
            >>> s.code2Tnumber("hsa")
            'T01001'
        """
        index = self.organismIds.index(code)
        return self.organismTnumbers[index]

    def Tnumber2code(self, Tnumber):
        """Converts organism T number to its code

        .. doctest::

            >>> from bioservices import KEGG
            >>> s = KEGG()
            >>> s.Tnumber2code("T01001")
            'hsa'
        """
        index = self.organismTnumbers.index(Tnumber)
        return self.organismIds[index]

    def isOrganism(self, org):
        """Checks if org is a KEGG organism

        :param str org:
        :return: True if org is in the KEGG organism list (code or Tnumber)

        ::

            >>> from bioservices import KEGG
            >>> s = KEGG()
            >>> s.isOrganism("hsa")
            True

        """
        if org in self.organismIds:
            return True
        if org in self.organismTnumbers:
            return True
        else:
            return False

    def _checkDB(self, database=None, mode=None):
        self.logging.info("checking database %s (mode=%s)" % (database, mode))
        isOrg = self.isOrganism(database)

        if mode == "info":
            if database not in KEGG._valid_databases_info and isOrg is False:
                self.logging.error("database or organism provided is not correct (mode=info)")
                raise
        elif mode == "list":
            if database not in KEGG._valid_databases_list and isOrg is False:
                self.logging.error("database provided is not correct (mode=list)")
                raise
        elif mode == "find":
            if database not in KEGG._valid_databases_find and isOrg is False:
                self.logging.error("database provided is not correct (mode=find)")
                raise
        elif mode == "link":
            if database not in KEGG._valid_databases_link and isOrg is False:
                self.logging.error("database provided is not correct (mode=link)")
                raise

        else:
            raise ValueError("mode can be only info, list, ")

    def dbinfo(self, database="kegg"):
        """Displays the current statistics of a given database

        :param str database: can be one of: kegg (default), brite, module,
            disease, drug, environ, ko, genome, compound, glycan, reaction,
            rpair, rclass, enzyme, genomes, genes, ligand or any
            :attr:`organismIds`.

        ::

            from bioservices import KEGG
            s = KEGG()
            s.dbinfo("hsa") # human organism
            s.dbinfo("T01001") # same as above
            s.dbinfo("pathway")

        .. versionchanged:: 1.4.1 renamed info method into :meth:`dbinfo`, 
            which clashes with Logging framework info() method.
        """
        self._checkDB(database, mode="info")
        res = self.http_get('info/' + database, frmt="txt")
        return res

    def list(self, query, organism=None):
        """Returns a list of entry identifiers and associated definition for a given database or a given set of database entries

        :param str query: can be one of pathway, brite, module,
            disease, drug, environ, ko, genome, compound,
            glycan, reaction, rpair, rclass, enzyme, organism
            **or** an organism from the :attr:`organismIds` attribute **or** a valid
            dbentry (see below). If a dbentry query is provided, organism
            should not be used!
        :param str organism: a valid organism identifier that can be
            provided. If so, database can be only "pathway" or "module". If
            not provided, the default value is chosen (:attr:`organism`)
        :return: A string with a structure that depends on the query



        Here is an example that shows how to extract the pathways IDs related to
        the hsa organism::

            >>> s = KEGG()
            >>> res = s.list("pathway", organism="hsa")
            >>> pathways = [x.split()[0] for x in res.strip().split("\\n")]
            >>> len(pathways)  # as of Dec 2012
            261

        Note, however, that there are convenient aliases to some of the databases.
        For instance, the pathway Ids can also be retrieved as a list from the
        :attr:`pathwayIds` attribute (after defining the :attr:`organism` attribute).

        .. note:: If you set the query to a valid organism, then the second
               argument rganism is irrelevant and ignored.

        .. note:: If the query is not a database or an organism, it is supposed
            to be a valid dbentries string and the maximum number of entries is 100.

        Other examples::

            s.list("pathway")             # returns the list of reference pathways
            s.list("pathway", "hsa")      # returns the list of human pathways
            s.list("organism")            # returns the list of KEGG organisms with taxonomic classification
            s.list("hsa")                 # returns the entire list of human genes
            s.list("T01001")              # same as above
            s.list("hsa:10458+ece:Z5100") # returns the list of a human gene and an E.coli O157 gene
            s.list("cpd:C01290+gl:G00092")# returns the list of a compound entry and a glycan entry
            s.list("C01290+G00092")       # same as above
        """
        url = "list"
        if query:
            #can be something else than a database so we can not use checkDB
            #self._checkDB(database, mode="list")
            url += "/" + query

        if organism:
            if organism not in self.organismIds:
                self.logging.error("""Invalid organism provided (%s). See the organismIds attribute""" % organism)
                raise BioServicesError("Not a valid organism")
            if query not in ["pathway", "module"]:
                self.logging.error("""
    If organism is set, then the first argument
    (database) must be either 'pathway' or 'module'. You provided %s""" % query)
                raise
            url += "/" + organism

        res = self.http_get(url, "txt")
        return res

    def find(self, database, query, option=None):
        """finds entries with matching query keywords or other query data in a given database

        :param str database: can be one of pathway, module, disease, drug,
            environ, ko, genome, compound, glycan, reaction, rpair, rclass,
            enzyme, genes, ligand or an organism code (see :attr:`organismIds`
            attributes) or T number (see :attr:`organismTnumbers` attribute).
        :param str query: See examples
        :param str option: If option provided, database can be only 'compound'
            or 'drug'. Option can be 'formula', 'exact_mass' or 'mol_weight'


        .. note:: Keyword search against brite is not supported. Use /list/brite to
            retrieve a short list.

        ::

            # search for pathways that contain Viral in the definition
            s.find("pathway", "Viral")
            # for keywords "shiga" and "toxin"
            s.find("genes", "shiga+toxin")
            # for keywords "shiga toxin"
            s.find("genes", ""shiga toxin")
            # for chemical formula "C7H10O5"
            s.find("compound", "C7H10O5", "formula")
            # for chemical formula containing "O5" and "C7"
            s.find("compound", "O5C7","formula")
            # for 174.045 =< exact mass < 174.055
            s.find("compound", "174.05","exact_mass")
            # for 300 =< molecular weight =< 310
            s.find("compound", "300-310","mol_weight")

        """
        _valid_options = ["formula", "exact_mass", "mol_weight"]
        _valid_db_options = ["compound", "drug"]

        self._checkDB(database, mode="find")
        url = "find/"+ database + "/" +  query

        if option:
            if database not in _valid_db_options:
                raise ValueError("invalid database. Since option was provided, database must be in %s" % _valid_db_options)
            if option not in _valid_options:
                raise ValueError("invalid option. Must be in %s " % _valid_options)
            url +=  "/" + option

        res = self.http_get(url, frmt="txt")
        return res

    def show_entry(self, entry):
        """Opens URL corresponding to a valid entry

        ::

            s.www_bget("path:hsa05416")

        """
        url = "http://www.kegg.jp/dbget-bin/www_bget?" + entry
        self.logging.info(url)
        webbrowser.open(url)

    def get(self, dbentries, option=None, parse=False):
        """Retrieves given database entries

        :param str dbentries: KEGG database entries involving the following
            database: pathway, brite, module, disease, drug, environ, ko, genome
            compound, glycan,  reaction, rpair, rclass, enzyme **or** any organism
            using the KEGG organism code (see :attr:`organismIds`
            attributes) or T number (see :attr:`organismTnumbers` attribute).
        :param str option: one of: aaseq, ntseq, mol, kcf, image, kgml

      .. note:: you can add the option at the end of dbentries in which case
            the parameter option must not be used (see example)

        ::

            from bioservices import KEGG
            s = KEGG()
            # retrieves a compound entry and a glycan entry
            s.get("cpd:C01290+gl:G00092")
            # same as above
            s.get("C01290+G00092")
            # retrieves a human gene entry and an E.coli O157 gene entry
            s.get("hsa:10458+ece:Z5100")
            # retrieves amino acid sequences of a human gene and an E.coli O157 gene
            s.get("hsa:10458+ece:Z5100/aaseq")
            # retrieves the image file of a pathway map
            s.get("hsa05130/image")
            # same as above
            s.get("hsa05130", "image")


        Another example here below shows how to save the image of a given pathway::

           res =  s.get("hsa05130/image")
            # same as : res =  s.get("hsa05130","image")
            f = open("test.png", "w")
            f.write(res)
            f.close()

        .. note::  The input is limited up to 10 entries (KEGG restriction).
        """
        _valid_options = ["aaseq", "ntseq", "mol", "kcf", "image", "kgml"]
        _valid_db_options = ["compound", "drug"]

        #self._checkDB(database, mode="find")
        url = "get/"+ dbentries

        if option:
            if option not in _valid_options:
                raise ValueError("invalid option. Must be in %s " % _valid_options)
            url +=  "/" + option

        res = self.http_get(url, frmt="txt")

        if parse is True:
            res = self.parse(res)

        return res

    def conv(self, target, source):
        """convert KEGG identifiers to/from outside identifiers

        :param str target: the target database (e.g., a KEGG organism).
        :param str source: the source database (e.g., uniprot) or a valid
            dbentries; see below for details.

        :return: a dictionary with keys being the source and values being the target.

        Here are the rules to set the target and source parameters.

        If the second argument is not a **dbentries**, source and target
        parameters can be of two types:

            #. gene identifiers. If the target is a KEGG Id, then the source
               must be one of *ncbi-gi*, *ncbi-geneid* or *uniprot*.

               .. note:: source and target can be swapped.
            #. chemical substance identifiers. If the target is one of the
               following kegg database: drug, compound, glycan then the source
               must be one of *pubchem* or *chebi*.

               .. note:: again, source and target can be swapped

        If the second argument is a **dbentries**, it can be again of two types:

            #. gene identifiers. The database used can be one ncbi-gi,
               ncbi-geneid, uniprot or any KEGG organism
            #. chemical substance identifiers. The database used can be one of
               drug, compound, glycan, pubchem or chebi only.

        .. note:: if the second argument is a dbentries, target and dbentries
            cannot be swapped.

        ::

            # conversion from NCBI GeneID to KEGG ID for E. coli genes
            conv("eco","ncbi-geneid")
            # inverse of the above example
            conv("eco","ncbi-geneid")
            #conversion from KEGG ID to NCBI GI
            conv("ncbi-gi","hsa:10458+ece:Z5100")


        To make it clear by taking another example, you can either convert an
        entire database to another (e.g., from uniprot to KEGG Id all human gene
        IDs)::

            uniprot_ids, kegg_ids = s.conv("hsa", "uniprot")

        or a subset by providing a valid **dbentries**::

            s.conv("hsa","up:Q9BV86+")


        .. warning:: dbentries are not check and are supposed to be correct.
            See :meth:`check_idbentries` to help you checking a dbentries.

        .. warning:: call to this function may be long. conv("hsa", "uniprot") takes a minute
            suprinsingly, conv("uniprot", "hsa") takes just a few seconds.

        .. versionchanged:: 1.1
            the output is now a dictionary, not a list of tuples
        """

        # The second argument may be a source_db or a dbentries so checking
        # second argument is kind of tricky because dbentries take lots of
        # different form.

        # for now, we only check the first argument.
        # gene identifiers
        isOrg = self.isOrganism(target)
        if isOrg is False and target not in ['ncbi-gi', 'ncbi-geneid', 'uniprot', 'pubchem',
                'chebi', 'drug', 'compound', 'glycan']:
                raise ValueError("""
    Invalid syntax. target must be a KEGG ID or
    one of the allowed database. See documentation og :meth:`conv` for
    details""")

        """if target in self.organismIds:
            if source not in ['ncbi-gi', 'ncbi-geneid', 'uniprot']:
                raise ValueError("Invalid source (must be ncbi-gi, ncbi-geneid or uniprot)")
        elif target in ['ncbi-gi', 'ncbi-geneid', 'uniprot']:
            if source not in self.organismIds:
                raise ValueError("Invalid source (must be a valid KEGG organism)")

        # for chenical substance identifiers
        elif target in ['drug', 'compound', 'glycan']:
            if source not in ['chebi', 'pubchem']:
                raise ValueError("Invalid source. Must be chebi or pubchem")
        elif target in ['chebi', 'pubchem']:
            if source not in ['drug', 'compound', 'glycan']:
                raise ValueError("Invalid source. Must be drug, compound or glycan")
        else:
            self.logging.info("arguments not checked")
        """
        url = "conv/"+ target + '/' + source
        res = self.http_get(url, frmt="txt")

        try:
            t = [x.split("\t")[0] for x in res.strip().split("\n")]
            s = [x.split("\t")[1] for x in res.strip().split("\n")]
            return dict([(x,y) for x,y in zip(t, s)])
        except:
            return res

    def link(self, target, source):
        """Find related entries by using database cross-references

        :param str target: the target KEGG database or organism (see below for the list).
        :param str source: the source KEGG database or organism (see below for
            the list) or a valid dbentries involving one of the database; see
            below for details.

        The valid list of databases is pathway, brite, module, disease, drug, environ,
        ko, genome, compound, glycan, reaction, rpair, rclass, enzyme

        ::

            # KEGG pathways linked from each of the human genes
            s.link("pathway", "hsa")
            # human genes linked from each of the KEGG pathways
            s.link("hsa", "pathway")
            # KEGG pathways linked from a human gene and an E. coli O157 gene.
            s.link("pathway", "hsa:10458+ece:Z5100")
        """
        self._checkDB(target, mode="link")

        url = "link/"+ target + '/' + source
        res = self.http_get(url, frmt="txt")
        return res

    def entry(self, dbentries):
        """Retrieve entry

        There is a weblink service (see http://www.genome.jp/kegg/rest/weblink.html)
        Since it is equivalent to :meth:`get`, we do not implement it for now

        """
        raise NotImplementedError("Use :meth:`get` instead")

    def show_pathway(self, pathId, scale=None, dcolor="pink", keggid={}):
        """Show a given pathway inside a web browser

        :param str pathId: a valid pathway Id. See :meth:`pathwayIds`
        :param int scale: you can scale the image with a value between 0 and 100
        :param str dcolor: set the default background color of nodes
        :param dict keggid: set color of entries contained in the pathway as
            key/value pairs; can also be a list, in which case all nodes have
            the same default color (red)

        .. note:: if scale is provided, dcolor and keggid are ignored.

        ::

            # show a pathway in the browser
            s.show_pathway("path:hsa05416", scale=50)

            # Same as above but also highlights some KEGG Ids (red for all)
            s.show_pathway("path:hsa05416", dcolor="white",
                keggid=['1525', '1604', '2534'])

            # You can refine the colors using a dictionary:
            s.show_pathway("path:hsa05416", dcolor="white",
                keggid={'1525':'yellow,red', '1604':'blue,green', '2534':"blue"})


        .. anotherway to higlight: http://www.kegg.jp/pathway/map00010+K00873+C00022
        """
        if pathId.startswith("path:"):
            pathId = pathId.split(":")[1]

        if scale:
            scale = int(scale/100.*100)/100. # just need 2 digits and a value in [0,1]
            url = "http://www.kegg.jp/kegg-bin/show_pathway?scale=" + str(scale)
            url += "&query=&map=" + pathId
        else:
            url = "http://www.kegg.jp/kegg-bin/show_pathway?" + pathId
            if dcolor:
                url += "/default%%3d%s/" % dcolor
            if isinstance(keggid, dict):
                if len(keggid.keys())>0:
                    for k,v in keggid.items():
                        if "," in v:
                            url += "/%s%%09%s/" % (k,v)
                        else:
                            url += "/%s%%09,%s/" % (k,v)
            elif isinstance(keggid, list):
                for k in keggid:
                    url += "/%s%%09,%s/" % (k,"red")

        self.logging.info(url)
        res = webbrowser.open(url)
        return res

    def show_module(self, modId):
        """Show a given module inside a web browser

        :param str modId: a valid module Id. See :meth:`moduleIds`

        Validity of modId is not checked but if wrong the URL will not open a
        proper web page.
        """
        if modId.startswith("md:"):
            modId = modId.split(":")[1]
        url = "http://www.kegg.jp/module/" + modId
        self.logging.info(url)
        res = webbrowser.open(url)
        return res

    # wrapper of all databases to ease access to them (buffered)

    def _get_db(self):
        return KEGG._valid_DB
    databases = property(_get_db, doc="Returns list of valid KEGG databases.")

    def _get_database(self, dbname, mode=0):
        res = self.list(dbname)
        assert mode in [0,1]
        return [x.split()[mode] for x in res.split("\n") if len(x)]

    def _get_organisms(self):
        if self._organisms is None:
            self._organisms = self._get_database("organism", 1)
        return self._organisms
    organismIds = property(_get_organisms, doc="Returns list of organism Ids")

    def _get_reactions(self):
        if self._reaction is None:
            self._reaction = self._get_database("reaction", 0)
        return self._reaction
    reactionIds = property(_get_reactions, doc="returns list of reaction Ids")

    def _get_enzyme(self):
        if self._enzyme is None:
            self._enzyme = self._get_database("enzyme", 0)
        return self._enzyme
    enzymeIds = property(_get_enzyme,
        doc="returns list of enzyme Ids" + _docIds)

    def _get_organisms_tnumbers(self):
        if self._organisms_tnumbers is None:
            self._organisms_tnumbers = self._get_database("organism", 0)
        return self._organisms_tnumbers
    organismTnumbers = property(_get_organisms_tnumbers,
        doc="returns list of organisms (T numbers)" + _docIds)

    def _get_glycans(self):
        if self._glycan is None:
            self._glycan = self._get_database("glycan", 0)
        return self._glycan
    glycanIds = property(_get_glycans,
        doc="Returns list of glycan Ids" + _docIds)

    def _get_brite(self):
        if self._brite is None:
            self._brite = self._get_database("brite", 0)
        return self._brite
    briteIds = property(_get_brite,
        doc="returns list of brite Ids." + _docIds)

    def _get_kos(self):
        if self._ko is None:
            self._ko = self._get_database("ko", 0)
        return self._ko
    koIds = property(_get_kos, doc="returns list of ko Ids" + _docIds)

    def _get_compound(self):
        if self._compound is None:
            self._compound =  self._get_database("compound", 0)
        return self._compound
    compoundIds = property(_get_compound,
        doc="returns list of compound Ids" + _docIds)

    def _get_drug(self):
        if self._drug is None:
            self._drug =  self._get_database("drug", 0)
        return self._drug
    drugIds = property(_get_drug, doc="returns list of drug Ids" + _docIds)

    # set the default organism used by pathways retrieval
    def _get_organism(self):
        return self._organism

    def _set_organism(self, organism):
        if organism in self.organismIds:
            self._organism = organism
            self._pathway = None
            self._module = None
            self._ko = None
            self._glycan = None
            self._compound = None
            self._enzyme = None
            self._drug = None
            self._reaction = None
            self._brite = None
        else:
            self.logging.error("Invalid organism. Check the list in :attr:`organismIds` attribute")
            raise
    organism = property(_get_organism, _set_organism, doc="returns the current default organism ")

    def _get_pathways(self):
        if self.organism is None:
            self.logging.warning("You must set the organism first (e.g., self.organism = 'hsa')")
            return

        if self._pathway is None:
            res = self.http_get("list/pathway/%s" % self.organism, frmt="txt")
            orgs = [x.split()[0] for x in res.split("\n") if len(x)]
            self._pathway = orgs[:]
        return self._pathway
    pathwayIds = property(_get_pathways, doc="""returns list of pathway Ids for the default organism.

    :attr:`organism` must be set.
    ::

        s = KEGG()
        s.organism = "hsa"
        s.pathwayIds

    """)

    def _get_modules(self):
        if self.organism is None:
            self.logging.warning("You must set the organism first (e.g., self.organism = 'hsa')")
            return

        if self._module is None:
            res = self.http_get("list/module/%s" % self.organism)
            orgs = [x.split()[0] for x in res.split("\n") if len(x)]
            self._module = orgs[:]
        return self._module
    moduleIds = property(_get_modules, doc="""returns list of module Ids for the default organism.

    :attr:`organism` must be set.
    ::

        s = KEGG()
        s.organism = "hsa"
        s.moduleIds

    """)

    def lookfor_organism(self, query):
        """Look for a specific organism

        :param str query: your search term. upper and lower cases are ignored
        :return: a list of definition that matches the query
        """
        matches = []
        definitions = [" ".join(x.split()) for x in self.list("organism").split("\n")]
        for i, item in enumerate(definitions):
            if query.lower() in item.lower():
                matches.append(i)
        return [definitions[i] for i in matches]

    def lookfor_pathway(self, query):
        """Look for a specific pathway

        :param str query: your search term. upper and lower cases are ignored
        :return: a list of definition that matches the query
        """
        matches = []
        definitions = [" ".join(x.split()) for x in self.list("pathway").split("\n")]
        for i, item in enumerate(definitions):
            if query.lower() in item.lower():
                matches.append(i)
        return [definitions[i] for i in matches]

    def get_pathway_by_gene(self, gene, organism):
        """Search for pathways that contain a specific gene

        :param str gene: a valid gene Id
        :param str organism: a valid organism (e.g., hsa)
        :return: list of pathway Ids that contain the gene

        ::

            >>> s.get_pathway_by_gene("7535", "hsa")
            ['path:hsa04064', 'path:hsa04650', 'path:hsa04660', 'path:hsa05340']

        """
        res = self.get(":".join([organism, gene]))
        dic = self.parse(res)
        if not isinstance(dic, int) and 'PATHWAY' in dic.keys():
            return dic['PATHWAY']
        else:
            self.logging.info("No pathway found ?")

    def parse_kgml_pathway(self, pathwayId, res=None):
        """Parse the pathway in KGML format and returns a dictionary (relations and entries)

        :param str pathwayId: a valid pathwayId e.g. hsa04660
        :param str res: if you already have the output of the query
            get(pathwayId), you can provide it, otherwise it is queried.
        :return: a dictionary with relations and entries as keys. Values
            of relations is a list of relations, each relation being
            dictionary with entry1, entry2, link, value, name. The
            list os entries is a list of dictionary as well.
            Entry contains contains more details about the entry found in the
            relation. See example below for details.


        ::

            >>> res = s.parse_kgml_pathway("hsa04660")
            >>> set([x['name'] for x in res['relations']])
            >>> res['relations'][-1]
            {'entry1': u'15',
             'entry2': u'13',
             'link': u'PPrel',
             'name': u'phosphorylation',
             'value': u'+p'}

            >>> set([x['link'] for x in res['relations']])
            set([u'PPrel', u'PCrel'])

            >>> # get information about an entry :
            >>> res['entries'][4]


        .. seealso:: `KEGG API <http://www.kegg.jp/kegg/xml/docs/>`_
        """
        output = {'relations':[], 'entries':[]}
        # Fixing bug #24 assembla
        if res is None:
            res = self.easyXML(self.get(pathwayId, "kgml"))
        else:
            res = self.easyXML(res)

        # read and parse the entries
        entries = [x for x in res.findAll("entry")]
        for entry in entries:
            output['entries'].append({
                'id': entry.get("id"),
                'name': entry.get("name"),
                'type': entry.get("type"),
                'link': entry.get("link"),
                'gene_names': entry.find("graphics").get("name")
                })

        relations = [(x.get("entry1"), x.get("entry2"), x.get("type")) for x in res.findAll("relation")]
        subtypes = [x.findAll("subtype") for x in res.findAll("relation")]

        assert len(subtypes) == len(relations)

        for relation, subtype in zip(relations, subtypes):
            if len(subtype)==0: # nothing to do with the species ??? TODO
                pass
            else:
                for this in subtype:
                    value = this.get("value")
                    name = this.get("name")
                    output['relations'].append({
                        'entry1':relation[0],
                        'entry2':relation[1],
                        'link':relation[2],
                       'value':value,
                        'name':name})
        # we need to map back to KEgg IDs...
        return output

    def pathway2sif(self, pathwayId, uniprot=True):
        """Extract protein-protein interaction from KEGG pathway to a SIF format

        .. warning:: experimental Not tested on all pathway. should be move to
            another package such as cellnopt

        :param str pathwayId: a valid pathway Id
        :param bool uniprot: convert to uniprot Id or not (default is True)
        :return: a list of relations (A 1 B) for activation and (A -1 B) for
            inhibitions

        This is longish due to the conversion from KEGGIds to UniProt.

        This method can be useful to provide prior knowledge network to software
        such as CellNOpt (see http://www.cellnopt.org)
        """
        res = self.parse_kgml_pathway(pathwayId)
        sif = []
        for rel in res['relations']:
            # types can be PPrel (protein-protein interaction only
            if rel['link'] != 'PPrel':
                continue
            if rel['name'] == 'activation':
                Id1 = rel['entry1']
                Id2 = rel['entry2']
                name1 = res['entries'][[x['id'] for x in res['entries']].index(Id1)]['name']
                name2 = res['entries'][[x['id'] for x in res['entries']].index(Id2)]['name']
                type1 = res['entries'][[x['id'] for x in res['entries']].index(Id1)]['type']
                type2 = res['entries'][[x['id'] for x in res['entries']].index(Id2)]['type']
                #print("names:", rel, name1, name2)
                #print(type1, type2)
                if type1!='gene' or type2!='gene':
                    continue
                if uniprot:
                    try:
                        # FIXME  sometimes, there are more than one name
                        name1 = name1.split()[0]
                        name2 = name2.split()[0]
                        name1 = self.conv("uniprot", name1)[name1]
                        name2 = self.conv("uniprot", name2)[name2]
                    except Exception:
                        self.logging.info(name1)
                        self.logging.info(name2)
                        raise Exception
                #print(name1, 1, name2)
                sif.append([name1, 1, name2])
            elif  rel['name'] == 'inhibition':
                Id1 = rel['entry1']
                Id2 = rel['entry2']
                name1 = res['entries'][[x['id'] for x in res['entries']].index(Id1)]['name']
                name2 = res['entries'][[x['id'] for x in res['entries']].index(Id2)]['name']
                type1 = res['entries'][[x['id'] for x in res['entries']].index(Id1)]['type']
                type2 = res['entries'][[x['id'] for x in res['entries']].index(Id2)]['type']
                #print("names:", rel, name1, name2)
                #print(type1, type2)
                if type1!='gene' or type2!='gene':
                    continue
                if uniprot:
                    name1 = name1.split()[0]
                    name2 = name2.split()[0]
                    name1 = self.conv("uniprot", name1)[name1]
                    name2 = self.conv("uniprot", name2)[name2]
                #print(name1, -1, name2)
                sif.append([name1, -1, name2])
            else:
                pass
                #print("#", rel['entry1'], rel['name'], rel['entry2'])

        return sif

    def parse(self, entry):
        """See :class:`KEGGParser` for details

        Parse entry returned by :meth:`get`

        ::

            k = KEGG()
            res = k.get("hsa04150")
            d = k.parse(res)

        """
        parse = dict()

        try:
            parse = self.keggParser.parse(entry)
        except:
            self.logging.warning('Could not parse the entry correctly.')

        return parse


class KEGGParser(object):
    """This is an extension of the :class:`KEGG` class to ease parsing of dbentries

    This class provides a generic method :meth:`parse` that will read the output
    of a dbentry returned by :meth:`KEGG.get` and converts it into a dictionary ready to use.

    The :meth:`parse` method parses any entry. It can be a pathway, a gene, a compound...
    ::

        from bioservices import *
        s = KEGG()

        # Retrieve a KEGG entry
        res = s.get("hsa04150")

        # parse it
        d = s.parse(res)

    As a pedagogical example, you can then further process this dictionary. Here below, we convert
    the gene Ids found in the pathway into UniProt Ids::

        # Get the KEGG Ids in the pathway
        kegg_geneIds = [x for x in d['GENE']]

        # Convert them
        db_up, db_kegg = s.conv("hsa", "uniprot")

        # Get the corresponding uniprot Ids
        indices = [db_kegg.index("hsa:%s" % x ) for x in kegg_geneIds]
        uniprot_geneIds = [db_up[x] for x in indices]

    However, you could also have done it simply as follows::

        kegg_geneIds = [x for x in d['gene']]
        uprot_geneIds = [s.parse(s.get("hsa:"+str(e)))['DBLINKS']["UniProt:"] for e in d['GENE']]

    .. note:: The 2 outputs are slightly different.

    .. seealso:: http://www.kegg.jp/kegg/rest/dbentry.html
    """
    def __init__(self, verbose=False):
        super(KEGGParser, self).__init__()
        self.logging = Logging("bioservices:keggparser", verbose)

    def parse(self, res):
        """Parse to any outputs returned by :meth:`KEGG.get`

        :param str res: output of a :meth:`KEGG.get`.
        :return: a dictionary. Keys are those found in the KEGG entry (e.g.,
            REACTION, ENTRY, EQUATION, ...). The format of each value is
            various. It could be a string, a list (of strings generally),
            a dictionary, a float depending on the key. Depdending on
            the type of the entry (e.g., module, pathway), the
            type of the value may also differ (e.g., REACTION can be either
            a list of reactions or a dictionary depending on the content)


        ::

            >>> # Parses a drug entry
            >>> res = s.get("dr:D00001")
            >>> # Parses a pathway entry
            >>> res = s.get("path:hsa10584")
            >>> # Parses a module entry
            >>> res = s.get("md:hsa_M00554")
            >>> # Parses a disease entry
            >>> res = s.get("ds:H00001")
            >>> # Parses a environ entry
            >>> res = s.get("ev:E00001")
            >>> # Parses Orthology entry
            >>> res = s.get("ko:K00001")
            >>> # Parses a Genome entry
            >>> res = s.get('genome:T00001')
            >>> # Parses a gene entry
            >>> res = s.get("hsa:1525")
            >>> # Parses a compound entry
            >>> s.get("cpd:C00001")
            >>> # Parses a glycan entry
            >>> res = s.get("gl:G00001")
            >>> # Parses a reaction entry
            >>> res = s.get("rn:R00001")
            >>> # Parses a rpair entry
            >>> res = s.get("rp:RP00001")
            >>> # Parses a rclass entry
            >>> res = s.get("rc:RC00001")
            >>> # Parses an enzyme entry
            >>> res = s.get('ec:1.1.1.1')

            >>> d = s.parse(res)
        """
        parser = dict()
        dbentry = "?"

        # get() should return a large amount of text data
        if not res or not isinstance(res, str):
            raise ValueError("Unexpected input, unable to parse data of type %s" % type(res))

        if res[:5] == "ENTRY":
            dbentry = res.split("\n")[0].split(None, 2)[2]
        else:
            raise ValueError("Unable to parse data, it does not comform to the expected KEGG format")

        try:
            parser = self._parse(res)
        except Exception as err:
            self.logging.warning("Could not parse the entry %s correctly" % dbentry)
            self.logging.warning(err)

        return parser

    def _parse(self, res):
        keys = [x.split(" ")[0] for x in res.split("\n") if len(x) and x[0]!=" "
                and x!="///"]
        # let us go line by to not forget anything and know which entries are
        # found in the RHS. We may have duplicated once as can be found in th
        # keys variable as well.
        entries = []
        entry = ""
        start = True
        for line in res.split("\n"):
            if line == '///':
                entries.append(entry)
            elif len(line) == 0:
                pass
            elif line[0] != " ":
                if start == True:
                    start = False
                else:
                    entries.append(entry)
                entry = line[:]
            else:
                entry+= "\n"+line[:]

        # we can now look at each entry and create a dictionary.
        # The dictionary will contain as key the name found in the LHS
        # e.g., REACTION and the value will be either the entry content
        # as a string or a list of strings if the key is not unique
        # e.g., for references. This could be a bit annoying since
        # for example References could appear only once if some cases.
        # This can be tested though by checking the type
        output = {}
        for entry in entries:
            name = entry.split("\n")[0].split()[0]
            if keys.count(name) == 1:
                output[name] = entry[:]
            else:
                if name in output.keys():
                    output[name].append(entry[:])
                else:
                    output[name] = [entry[:]]
        # remove name that are now the keys of the dictionary anyway
        # if the values is not a list
        for k, v in output.items():
            if k in ['CHROMOSOME', 'TAXONOMY']:
                continue
            try:
                output[k] = output[k].strip().replace(k, '', 1).strip()
            except: # skip the lists
                pass


        # Now, let us do the real stuff.
        # This is tricky since format is not consistent with the names e,g
        # REACTIONS could be sometimes a list of names and sometimes list
        # of reactions with their description.
        self.raw_parsing = copy.deepcopy(output)

        for key, value in output.items():
            # convert to a dict
            if key == 'STATISTICS':
                data = [x.split(":",1) for x in output[key].split("\n")]
                data = dict([(x[0].strip(), float(x[1].strip())) for x in data])
                output[key] = data
            # strip only: expecting a single line (string)
            elif key in ['POSITION', 'DESCRIPTION', 'ENTRY', 'ORGANISM',
                    'CLASS', 'FORMULA', 'KEYWORDS', 'CATEGORY', 'ANNOTATION', 
                    'DATA_SOURCE', 'MASS', 'COMPOSITION', 'DEFINITION',
                    'KO_PATHWAY', 'EQUATION', 'TYPE', 'RCLASS', 'SYSNAME', "HISTORY",
                    "REL_PATHWAY"]:
                # get rid of \n

                if "\n" in value:
                    # happens in description path:hsa04915
                    value = value.replace("\n", " ")
                # nothing to do here except strip
                output[key] = value.strip()
            # list : set of lines. Could be split by ; character but we use the
            # \n instead to be sure
            # COMMENT is sometimes on several lines
            elif key in ['NAME', 'REMARK', 'ACTIVITY', 'COMMENT', 'ORIGINAL_DB',
                "SUBSTRATE", "PRODUCT", "ENV_FACTOR", "PATHOGEN"]:
                output[key] = [x.strip() for x in value.split("\n")]
            # list: long string splitted into items and therefore converted to a list
            elif key in ['ENZYME', 'REACTION',  'RPAIR', 'RELATEDPAIR',
                  "ALL_REAC"]:
                # RPAIR/rn:R00005 should be a dict if "_" found
                # REACTION/md:hsa_M00554 should be a dict if '->' found
                if '->' in value or "_" in value:
                    kp = {}
                    for line in value.split("\n"):
                        try:
                            k,v = line.strip().split(None,1)
                        except:
                            #self.logging.warning("empty line in %s %s" % (key, line))
                            k = line.strip()
                            v = ''
                        kp[k] = v
                    output[key] = kp.copy()
                else:
                    output[key] = [x.strip() for x in value.split()]
            # transform to dictionary by splitting line and then split on ";"
            elif key in "GENE":
                kp = {}
                for i, line in enumerate(value.split("\n")):
                    try:
                        k, v = line.strip().split(None, 1)
                    except:
                        #self.logging.warning("empty line in %s %s" % (key, line))
                        k = line.strip()
                        v = ''
                    kp[k] = v
                output[key] = kp.copy()

            elif key in ['DRUG', 'ORTHOLOGY', 'COMPOUND', 'RMODULE',
                    'DISEASE', 'PATHWAY_MAP', 'STR_MAP', 'OTHER_MAP',
                    'PATHWAY', 'MODULE', 'GENES']:
                kp = {}
                for line in value.split("\n"):
                    try: # empty orthology in rc:RC00004
                        k,v = line.strip().split(None,1)
                    except:
                        #self.logging.warning("empty line in %s %s" % (key, line))
                        k = line.strip()
                        v = ''
                    if k.endswith(":"):
                        k = k.strip().rstrip(":")
                    kp[k] = v
                output[key] = kp.copy()
            # list of dictionaries
            elif key == 'REFERENCE':
                # transform to a list since you may have several entries
                newvalue = [self._interpret_references(this)
                        for this in self._tolist(value)]
                output[key] = newvalue
            # list of dictionaries
            elif key == 'PLASMID':
                newvalue = [self._interpret_plasmid(this)
                        for this in self._tolist(value)]
                output[key] = newvalue
            # list of dictionaries
            elif key == 'CHROMOSOME':
                newvalue = [self._interpret_chromosome(this)
                    for this in self._tolist(value)]
                output[key] = newvalue
            # list of dictionaries
            elif key == 'TAXONOMY':
                newvalue = [self._interpret_taxonomy(this)
                    for this in self._tolist(value)]
                output[key] = newvalue
            elif key == "SEQUENCE":
                newvalue = [self._interpret_sequence(this)
                    for this in self._tolist(value)]
                output[key] = newvalue

            # dictionary, interpreted as follows
            # on each line, there is an identifier followed by : character
            # looks like there is just one line...
            elif key in ['DRUG_TARGET', 'STRUCTURE', 'MOTIF']:
                # STRUCTURE PDB can be long and span over several lines. e.g.,
                # hsa:1525
                new = {}
                import re
                value = re.sub("\n {6,20}", " ", value)
                for line in value.split("\n"):
                    thiskey, content = line.split(None, 1)
                    if thiskey.endswith(":"):
                        new[thiskey[:-1]] = content
                    else:
                        self.logging.warning("Could not fully interpret %s " % key )
                output[key] = new
            elif key in ['DBLINKS', 'INTERACTION', 'METABOLISM']:
                # D01441 for metabolism
                # DBLINKS for C00624 should work out of the box
                new = {}
                import re
                value = re.sub("\n {12,12+1}", "\n", value)
                # Sometimes (e.g. INTERACTION in D00136) values are empty
                # see issue #85
                if len(value):
                    for line in value.split("\n"):
                        thiskey, content = line.strip().split(":", 1)
                        new[thiskey] = content.strip()
                    output[key] = new
                else:
                    output[key] = value
            # floats
            elif key in ['EXACT_MASS', 'MOL_WEIGHT']:
                output[key] = float(value)
            # get rid of the length
            elif key in ['AASEQ', 'NTSEQ']:
                output[key] = value.split("\n", 1)[1].replace("\n","")
            elif key.startswith("ENTRY"):
                newvalue = self._interpret_entry(value)
                output[key] = newvalue.strip()
            # extract list of strings from structure. These strings are not 
            # interpreted
            elif key in ['ATOM', 'BOND', 'NODE', 'EDGE', 'ALIGN', 'RDM']:
                # starts with a number that defines number of entries. Let us
                # get rid of that number and then send a list
                output[key] = self._interpret_enumeration(output[key])
            # not interpreted
            elif key in ['BRACKET', 'COMPONENT', 'SOURCE', 'BRITE', "TARGET",
                    'CARCINOGEN', 'MARKER']: # do not interpret to keep structure
                pass
            else:
                self.logging.warning("""Found keyword %s, which has not special
    parsing for now. please report this issue with the KEGG 
    identifier (%s) into github.com/bioservices. Thanks T.C.""" % (key,output['ENTRY']))


        return output

    def _interpret_enumeration(self, data):
        N = data.strip().split("\n")[0]
        # must be a number
        N = int(N)
        lines = data.strip().split("\n")[1:]
        lines = [line.strip() for line in lines]
        if len(lines) != N:
            self.logging.warning('number of lines not as expected in %s' % data)

        if N == 0:
            return []
        else:
            return lines

    def _tolist(self, value):
        # transform to a list since you may have several entries
        if isinstance(value, list) is False:
            value = [value]
        return value

    def _interpret_entry(self, data):
        res = {}

        for this in data.split("\n"):
            if this.strip().startswith("ENTRY"):
                pass
            elif this.strip().startswith("COMPOUND"):
                res['COMPOUND'] = this.strip().split(None,1)[1]
            elif this.strip().startswith("ATOM"):
                res['AUTHORS'] = this.strip().split(None,1)[1]
            elif this.strip().startswith("TITLE"):
                res['TITLE'] = this.strip().split(None,1)[1]
        return res

    def _interpret_sequence(self, data):
        fields = ["GENE", "ORGANISM", "TYPE"]
        current_field = "SEQUENCE"
        res = dict({current_field: u""})

        for this in data.split("\n"):
            this = this.strip()
            for f in fields:
                if this.startswith(f):
                    fields.remove(f)
                    current_field = f
                    this = this.split(None, 1)[1]
                    res[current_field] = ''
                    break
            res[current_field] += this + u' '
        if res['SEQUENCE'] == u'':
            res.pop('SEQUENCE')
        for k in res.keys():
            res[k] = res[k].strip()
        return res
        # changed v1.4.18 issue #79
        """for this in data.split("\n"):
            if this.strip().startswith("GENE"):
                res['GENE'] = this.strip().split(None,1)[1]
                gene = True
            elif this.strip().startswith('ORGANISM'):
                res['ORGANISM'] = this.strip().split(None,1)[1]
                gene = False
            elif this.strip().startswith('TYPE'):
                res['TYPE'] = this.strip().split(None, 1)[1]
                gene = False
            elif gene is True:
                res['GENE'] += this.strip()
            else:
                assert gene is False
                res['SEQUENCE'] = this.strip()
        return res
        """
    """
    [u'    0 Mtk  1 Mtd  2 Mad  3 Man  4 Mte  5 Mtk  6 Mtd  7 Man',
 u'  GENE      0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII',
 u'            [UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]',
 u'  ORGANISM  Micromonospora griseorubida',
 u'  TYPE      PK']
    """



    def _interpret_taxonomy(self, data):
        res = {}
        for this in data.split("\n"):
            if this.strip().startswith("TAXONOMY"):
                res['TAXONOMY'] = this.strip()
            elif this.strip().startswith('LINEAGE'):
                res['LINEAGE'] = this.strip().split(None,1)[1]
        return res

    def _interpret_references(self, data):
        res = {}
        for this in data.split("\n"):
            fields = this.strip().split(None, 1)
            if len(fields) <= 1: continue
            if fields[0] in ("REFERENCE", "AUTHORS", "TITLE", "JOURNAL"):
                res[fields[0]] = fields[1]

            #if this.strip().startswith("REFERENCE"):
            #    res['REFERENCE'] = this.strip().split(None,1)[1]
            #elif this.strip().startswith("JOURNAL"):
            #    res['JOURNAL'] = this.strip().split(None,1)[1]
            #elif this.strip().startswith("AUTHORS"):
            #    res['AUTHORS'] = this.strip().split(None,1)[1]
            #elif this.strip().startswith("TITLE"):
            #    res['TITLE'] = this.strip().split(None,1)[1]
        return res

    def _interpret_plasmid(self, data):
        res = {}
        for this in data.split("\n"):
            if this.strip().startswith("PLASMID"):
                res['PLASMID'] = this.strip().split(None,1)[1]
            elif this.strip().startswith("LENGTH"):
                res['LENGTH'] = this.strip().split(None,1)[1]
            elif this.strip().startswith("SEQUENCE"):
                res['SEQUENCE'] = this.strip().split(None,1)[1]
        return res

    def _interpret_chromosome(self, data):
        res = {}
        for this in data.split("\n"):
            if this.strip().startswith("CHROMOSOME"):
                try:
                    res['CHROMOSOME'] = this.strip().split(None,1)[1]
                except:
                    #genome:T00012 has no name
                    res['CHROMOSOME'] = this.strip()
            elif this.strip().startswith("LENGTH"):
                res['LENGTH'] = this.strip().split(None,1)[1]
            elif this.strip().startswith("SEQUENCE"):
                res['SEQUENCE'] = this.strip().split(None,1)[1]
        return res




class KEGGTools(KEGG):
    """Load all genes from the database.

    ::

        k = kegg.KEGGTools()
        k.load_genes("hsa")
        genes = k.scan_genes()


    """
    def __init__(self, verbose=False, organism="hsa"):
        self.kegg = KEGG()
        self.parser = KEGGParser()
        self.kegg.logging.info("Initialisation. Please wait")
        self.load_genes(organism)

    def load_genes(self, organism):
        res = self.kegg.list(organism)
        self.genes =  [x.split("\t")[0] for x in res.strip().split("\n")]
        return self.genes

    def scan_genes(self):
        from easydev import progress_bar
        pb = progress_bar(len(self.genes))
        import time
        genes = {}
        for i, gene in enumerate(self.genes):
            genes[gene] = self.parser.parse(self.kegg.get(self.genes[i]))
            pb.animate(i, time.time()-pb.start)
        return genes

    def load_reactions(self, organism):
        reactions = self.kegg.list('reaction')
        self.reactions = [x.split()[0] for x in reactions.split("\n") if len(x)]
        return self.reactions

    def scan_reactions(self):
        from easydev import progress_bar
        pb = progress_bar(len(self.reactions))
        import time
        reactions = {}
        for i, this in enumerate(self.reactions):
            reactions[this] = self.parser.parse(self.kegg.get(self.reactions[i]))
            pb.animate(i, time.time()-pb.start)
        return reactions



